load data

USE_DC <- FALSE

text = read.csv('/Users/roso8920/Emotive Computing Dropbox/Rosy Southwell/EML Rosy/EMLLM/info/ia_label_mapping_opt_surprisal.csv')
if (USE_DC){eeg_dir = '/Users/roso8920/Emotive Computing Dropbox/Rosy Southwell/EML Rosy/Data/EEG_processed/unfolded_FRP_reparsed_v6/n400_stats_recomputed/'
} else {eeg_dir = '/Users/roso8920/Emotive Computing Dropbox/Rosy Southwell/EML Rosy/Data/EEG_processed/unfolded_FRP_reparsed_v6/n400_stats_nodc/'
}
file_pat = '_reading_N400_stats\\.csv$'
files=list.files(path = eeg_dir, pattern= file_pat)

Loop over participants

all<-c()
i<-0
for (f in files){
    i=i+1
    df <-read.csv(paste0(eeg_dir,f))
    p <- strsplit(f,'_')
    pID <- paste(p[[1]][1],p[[1]][2],sep='_')
    df$pID <- pID
    # df<- df %>% dplyr::select(c("pID", "type","latency","fixDur","task","identifier" , "page_fixation_ix","IA_ID","n400_magnitude_CPz","n400_latency_CPz"  ))
    df<- df %>% filter(task=='reading') %>% droplevels()
    all[[i]] <- df
}
df <- do.call(rbind, all)

Merge with text info for surprisal values

delta_wf = log(min(text$word_freq[text$word_freq>0]))-1 # use the minimum word frequency for any missing word frerquencies so we can take log

text <- text %>% rename(surprisalText=opt.125m_surprisal_wholetext, surprisalPage = opt.125m_surprisal_page)

text<- text %>%mutate(logSurprisalText = log(surprisalText),
                  logSurprisalPage = log(surprisalPage),
                  logWordFreq = ifelse(word_freq>0,log(word_freq),delta_wf)
   )
df <- df %>% merge(text, on=c("IA_ID","identifier")) %>% rename(fixDur=duration)


# drop rows with no IA
df <- drop_na(df, "IA_LABEL")
# drop rows with punc
df <- df %>% filter(! IA_LABEL %in% c('.',',','-','--',';',':',"'",'?','!'))

feats <- grep("n400|p1|n1|fixDur",colnames(df), value=T)
feats_eeg <- grep("n400|p1|n1",colnames(df), value=T)

basic checks on text variables

ggplot(text,aes(x=relative_word_position, y=logSurprisalText)) + 
geom_point(alpha=0.5, size=0.5)+stat_cor(method="pearson", size=4, r.accuracy=0.01, p.accuracy=0.001)

transforms

# df<-df %>% mutate(across(contains('n400_magnitude'),
#              .fns=~log(-.x)), .names="log_neg_{.col}")
df<-df %>% mutate(
                  logSurprisalText = log(surprisalText),
                  logSurprisalPage = log(surprisalPage),
                  logWordFreq = ifelse(word_freq>0,log(word_freq),-18.98),
                  logFixDur = log(fixDur)
)
# lagged surprisal
df <- df %>% group_by(pID, identifier) %>% arrange(page_fixation_ix, .by_group=T) %>% 
  mutate(prev_surprisalText = dplyr::lag(surprisalText, n=1, default=NA),
         prev_surprisalPage = dplyr::lag(surprisalPage, n=1, default=NA),
         prev_logSurprisalText = dplyr::lag(logSurprisalText, n=1, default=NA),
         prev_logSurprisalPage = dplyr::lag(logSurprisalPage, n=1, default=NA),
         )
# refixation
df <- df %>% group_by(pID, identifier, IA_ID) %>% mutate(
  IA_fix_count = n(),
  refixation_ = ifelse(n()>1,1,0),
  refixation=as.factor(ifelse(n()>1,1,0)))

# channel averags
df <- df %>% mutate(
  p1_mean = (p1_mean_PPO9h + p1_mean_PPO10h / 2),
  n1_mean = (n1_mean_PPO9h + n1_mean_PPO10h / 2)
)

examine

df_sub <- df %>% group_by(pID) %>% summarise_if(is.numeric,mean, na.rm=T)

plots

df_long <- df %>% pivot_longer(all_of(c(feats,'logFixDur')), names_to="feature",values_to="value")
df_long <- df_long %>% mutate(channel=str_match(feature, 'n400_\\w*_([[:alnum:]]{2,5})$')[,2],
                              feature_type=str_match(feature, '(n400_\\w*)_[[:alnum:]]{2,5}$')[,2])
# compute lower and upper whiskers for each group
ylims <- df_long %>%
  group_by(feature_type) %>%
  summarise(Q1 = quantile(value, 1/100), Q3 = quantile(value, 99/100)) %>%
  ungroup()



p1<-ggplot(df_long %>% filter(feature_type=='n400_mean')) + 
  geom_density(aes(x=value, fill=channel), alpha=0.3) + xlim(-1000,1000) + ggtitle('n400_mean')
p2<-ggplot(df_long %>% filter(feature_type=='n400_min_magnitude')) + 
  geom_density(aes(x=value, fill=channel), alpha=0.3) + xlim(-1000,1000) + ggtitle('n400_min')
p3<-ggplot(df_long %>% filter(feature_type=='n400_max_magnitude')) + 
  geom_density(aes(x=value, fill=channel), alpha=0.3) + xlim(-1000,1000) + ggtitle('n400_max')
grid.arrange(p1,p2,p3)

ggplot(df)+
    geom_density(aes(x=fixDur))+
  scale_x_log10()

ggplot(df)+
    geom_density(aes(x=surprisalText))+
  scale_x_log10()

ggplot(df_long)+
    geom_violin(aes(x=value,y=factor(channel), fill=factor(channel),outliers = FALSE), alpha=0.3)+
  facet_wrap(~feature_type, scales="free") 

ggplot(df_long %>% filter(feature_type =='n400_mean'))+
    geom_point(aes(x=surprisalText,y=value, color=factor(channel)), alpha=0.05, size=0.5)+
  facet_wrap(~channel, scales="free") 

repeat some plots after transformatiions

ggplot(df,aes(x=logSurprisalText, y=logFixDur)) + 
  geom_point(size=1, alpha=0.01)+stat_cor(method="pearson")

ggplot(df_long %>% filter(feature_type =='n400_mean'),aes(x=logSurprisalText,y=value, color=factor(channel)))+
    geom_point(alpha=0.01, size=0.5)+stat_cor(method="pearson", size=2, r.accuracy=0.01, p.accuracy=0.001, label.x=-7.5, label.y=1000)+
  facet_wrap(~channel, scales="free") 

ggplot(df_long %>% filter(feature_type =='n400_max_magnitude'),aes(x=logSurprisalText,y=value, color=factor(channel)))+
    geom_point(alpha=0.01, size=0.5)+stat_cor(method="pearson", size=2, r.accuracy=0.01, p.accuracy=0.001, label.x=-7.5, label.y=1000)+
  facet_wrap(~channel, scales="free") 

ggplot(df_long %>% filter(feature_type =='n400_min_magnitude'),aes(x=logSurprisalText,y=value, color=factor(channel)))+
    geom_point(alpha=0.01, size=0.5)+stat_cor(method="pearson", size=2, r.accuracy=0.01, p.accuracy=0.001, label.x=-7.5, label.y=1000)+
  facet_wrap(~channel, scales="free") 

LME stats

predict EEG from LOG surprisal and fixDur

m.l.m <- lmer(n400_mean_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df)
m.l.min <- lmer(n400_max_magnitude_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df)
m.l.max <- lmer(n400_min_magnitude_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df)
m.l.zc <- lmer(n400_zero_crossings_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df)
m.l.p1 <-lmer(p1_mean ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df)
m.l.n1 <- lmer(n1_mean ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df)
tab_model(m.l.m, m.l.p1, m.l.n1, show.ci=F)
  n 400 mean C Pz p 1 mean n 1 mean
Predictors Estimates p Estimates p Estimates p
(Intercept) 17.14 <0.001 -54.07 <0.001 -54.93 <0.001
logFixDur -2.85 <0.001 17.61 <0.001 9.67 <0.001
logSurprisalText -0.91 <0.001 -0.88 0.019 -1.21 0.001
Random Effects
σ2 10397.76 25848.40 25986.22
τ00 98.97 pID 469.06 pID 152.63 pID
26.39 identifier 72.74 identifier 53.34 identifier
ICC 0.01 0.02 0.01
N 50 identifier 50 identifier 50 identifier
91 pID 91 pID 91 pID
Observations 104272 104272 104272
Marginal R2 / Conditional R2 0.000 / 0.012 0.003 / 0.023 0.001 / 0.009

predict fixDur from surprisal

m.d <- lmer(fixDur ~ (1|identifier) + (1|pID) + logSurprisalText, data=df)
tab_model(m.d)
  fix Dur
Predictors Estimates CI p
(Intercept) 208.40 202.58 – 214.22 <0.001
logSurprisalText 1.81 1.38 – 2.24 <0.001
Random Effects
σ2 8839.40
τ00 pID 729.69
τ00 identifier 31.32
ICC 0.08
N identifier 50
N pID 91
Observations 104272
Marginal R2 / Conditional R2 0.001 / 0.080

predict fixation fixDur from EEG features

m.eye <- lmer(fixDur ~ (1|identifier) + (1|pID) + n400_mean_CPz, data=df)
tab_model(m.eye)
  fix Dur
Predictors Estimates CI p
(Intercept) 210.22 204.41 – 216.02 <0.001
n400 mean CPz -0.01 -0.01 – -0.00 0.003
Random Effects
σ2 8844.38
τ00 pID 729.05
τ00 identifier 32.34
ICC 0.08
N identifier 50
N pID 91
Observations 104272
Marginal R2 / Conditional R2 0.000 / 0.079

predict EEG and surprisal from previous IA surprisal

mp.m <- lmer(n400_mean_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText + prev_logSurprisalText, data=df)
tab_model(mp.m)
  n 400 mean C Pz
Predictors Estimates CI p
(Intercept) 16.72 9.04 – 24.40 <0.001
logFixDur -2.71 -4.08 – -1.33 <0.001
logSurprisalText -0.87 -1.34 – -0.40 <0.001
prev logSurprisalText -0.52 -0.99 – -0.06 0.028
Random Effects
σ2 10336.85
τ00 pID 100.21
τ00 identifier 26.79
ICC 0.01
N identifier 50
N pID 91
Observations 102838
Marginal R2 / Conditional R2 0.000 / 0.013
mp.s <- lm( logSurprisalText ~ prev_logSurprisalText, data=df)
tab_model(mp.s)
  log Surprisal Text
Predictors Estimates CI p
(Intercept) 0.89 0.88 – 0.90 <0.001
prev logSurprisalText 0.15 0.14 – 0.15 <0.001
Observations 102838
R2 / R2 adjusted 0.022 / 0.022

model including bool for refixation

mr.m <- lmer(n400_mean_CPz ~ (1|identifier) + (1|pID) + logFixDur + refixation*logSurprisalText, data=df)
tab_model(mr.m)
  n 400 mean C Pz
Predictors Estimates CI p
(Intercept) 16.93 9.23 – 24.62 <0.001
logFixDur -2.86 -4.22 – -1.49 <0.001
refixation [1] 0.54 -1.08 – 2.15 0.515
logSurprisalText -0.49 -1.24 – 0.25 0.192
refixation [1] *
logSurprisalText
-0.68 -1.63 – 0.27 0.160
Random Effects
σ2 10397.75
τ00 pID 99.06
τ00 identifier 26.38
ICC 0.01
N identifier 50
N pID 91
Observations 104272
Marginal R2 / Conditional R2 0.000 / 0.012
Anova(mr.m)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: n400_mean_CPz
##                               Chisq Df Pr(>Chisq)    
## logFixDur                   16.8361  1 0.00004075 ***
## refixation                   0.0206  1  0.8859451    
## logSurprisalText            14.5204  1  0.0001387 ***
## refixation:logSurprisalText  1.9730  1  0.1601283    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_model(mr.m,type = "emm", terms=c("logSurprisalText", "refixation"))

tb<-tableone::CreateTableOne(data = df, vars = c("n400_mean_CPz", "logFixDur"), strata = 'refixation', test=T)
knitr::kable(print(tb))
##                            Stratified by refixation
##                             0              1              p      test
##   n                         36498          67774                     
##   n400_mean_CPz (mean (SD))  1.15 (105.74)  0.98 (100.75)  0.795     
##   logFixDur (mean (SD))      5.27 (0.44)    5.24 (0.49)   <0.001
0 1 p test
n 36498 67774
n400_mean_CPz (mean (SD)) 1.15 (105.74) 0.98 (100.75) 0.795
logFixDur (mean (SD)) 5.27 (0.44) 5.24 (0.49) <0.001

model only firstpass fixations

m.fp.m <- lmer(n400_mean_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# m.fp.min <- lmer(n400_max_magnitude_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# m.fp.max <- lmer(n400_min_magnitude_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# m.fp.zc <- lmer(n400_zero_crossings_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# tab_model(m.fp.m, m.fp.min, m.fp.max, m.fp.zc, show.ci=F)
m.fp.p1 <-lmer(p1_mean ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df%>% filter(refixation_==1))
m.fp.n1 <- lmer(n1_mean ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df%>% filter(refixation_==1))
tab_model(m.fp.m, m.fp.p1, m.fp.n1, show.ci=F)
  n 400 mean C Pz p 1 mean n 1 mean
Predictors Estimates p Estimates p Estimates p
(Intercept) 11.96 0.007 -54.47 <0.001 -54.40 <0.001
logFixDur -1.88 0.022 17.47 <0.001 9.52 <0.001
logSurprisalText -1.13 <0.001 -1.16 0.014 -1.31 0.006
Random Effects
σ2 10073.81 24711.30 24924.04
τ00 70.33 pID 492.04 pID 177.96 pID
21.42 identifier 62.87 identifier 45.67 identifier
ICC 0.01 0.02 0.01
N 50 identifier 50 identifier 50 identifier
91 pID 91 pID 91 pID
Observations 67774 67774 67774
Marginal R2 / Conditional R2 0.000 / 0.009 0.003 / 0.025 0.001 / 0.010

model including other lexical variables & first pass fixations only

m.lx.m <- lmer(n400_mean_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText + logWordFreq + relative_word_position, data=df %>% filter(refixation_==1))
# m.fp.min <- lmer(n400_max_magnitude_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# m.fp.max <- lmer(n400_min_magnitude_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# m.fp.zc <- lmer(n400_zero_crossings_CPz ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText, data=df %>% filter(refixation_==1))
# tab_model(m.fp.m, m.fp.min, m.fp.max, m.fp.zc, show.ci=F)
m.lx.p1 <-lmer(p1_mean ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText+ logWordFreq + relative_word_position, data=df%>% filter(refixation_==1))
m.lx.n1 <- lmer(n1_mean ~ (1|identifier) + (1|pID) + logFixDur + logSurprisalText+ logWordFreq + relative_word_position, data=df%>% filter(refixation_==1))
tab_model(m.lx.m, m.lx.p1, m.lx.n1, show.ci=F)
  n 400 mean C Pz p 1 mean n 1 mean
Predictors Estimates p Estimates p Estimates p
(Intercept) 7.50 0.104 -62.90 <0.001 -57.07 <0.001
logFixDur -1.95 0.018 17.39 <0.001 9.56 <0.001
logSurprisalText -1.31 <0.001 -1.28 0.010 -0.97 0.051
logWordFreq -0.33 0.013 -0.38 0.072 0.30 0.150
relative word position 4.04 0.004 11.77 <0.001 10.66 <0.001
Random Effects
σ2 10071.71 24699.86 24916.07
τ00 70.24 pID 492.01 pID 177.87 pID
21.59 identifier 62.13 identifier 45.21 identifier
ICC 0.01 0.02 0.01
N 50 identifier 50 identifier 50 identifier
91 pID 91 pID 91 pID
Observations 67774 67774 67774
Marginal R2 / Conditional R2 0.001 / 0.010 0.003 / 0.025 0.001 / 0.010

predict EEG from surprisal and fixDur with random slope of surprisal for participant

m.rs.m <- lmer(n400_mean_CPz ~ (1+logSurprisalText|pID) + (1|identifier) + logSurprisalText + logFixDur, data=df)
m.rs.min <- lmer(n400_max_magnitude_CPz ~ (1+logSurprisalText|pID) +  (1|identifier) + logFixDur + logSurprisalText, data=df)
m.rs.max <- lmer(n400_min_magnitude_CPz ~(1+logSurprisalText|pID) +  (1|identifier) + logFixDur + logSurprisalText, data=df)
tab_model(m.rs.m, m.rs.min, m.rs.max, show.ci=F)
  n 400 mean C Pz n 400 max magnitude C Pz n 400 min magnitude C Pz
Predictors Estimates p Estimates p Estimates p
(Intercept) 17.12 <0.001 95.94 <0.001 -64.66 <0.001
logSurprisalText -0.93 0.003 -1.01 0.002 -0.74 0.019
logFixDur -2.84 <0.001 -3.64 <0.001 -1.84 0.015
Random Effects
σ2 10391.52 10709.01 12203.94
τ00 112.29 pID 907.49 pID 709.26 pID
26.68 identifier 29.53 identifier 31.44 identifier
τ11 3.31 pID.logSurprisalText 4.00 pID.logSurprisalText 2.87 pID.logSurprisalText
ρ01 -0.43 pID -0.04 pID -0.06 pID
ICC 0.01 0.08 0.06
N 91 pID 91 pID 91 pID
50 identifier 50 identifier 50 identifier
Observations 104272 104272 104272
Marginal R2 / Conditional R2 0.000 / 0.013 0.000 / 0.081 0.000 / 0.058
dotplot(ranef(m.rs.m))
## $pID

## 
## $identifier

Modeling with comprehension/MW

Load Behavioural data

df_comp <- read.csv('/Users/roso8920/Emotive Computing Dropbox/Rosy Southwell/EyeMindLink/Processed/Behaviour/EML1_page_level.csv')
df_comp <- df_comp %>% rename(pID=ParticipantID) %>% 
  mutate(Page=PageNum-1,
         identifier=paste0(Text, Page))
df <- merge(df, df_comp, on=c("pID","identifier"))
df_mw <- df %>% drop_na(MW) %>% mutate(MW=as.factor(MW))

model FIXATION LEVEL N400 ~ surprisal*MW (caveat - MW is at page level)

m.mw <- lmer(n400_mean_CPz ~ (1+logSurprisalText|pID) + (1|identifier)  + logSurprisalText*MW , data=df_mw)
Anova(m.mw)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: n400_mean_CPz
##                      Chisq Df Pr(>Chisq)
## logSurprisalText    0.5168  1     0.4722
## MW                  0.5501  1     0.4583
## logSurprisalText:MW 1.2574  1     0.2621
tab_model(m.mw)
  n 400 mean C Pz
Predictors Estimates CI p
(Intercept) 3.71 -1.92 – 9.34 0.196
logSurprisalText 0.06 -1.29 – 1.40 0.934
MW [1] 2.99 -1.81 – 7.79 0.222
logSurprisalText * MW [1] -1.22 -3.36 – 0.91 0.262
Random Effects
σ2 9889.06
τ00 pID 333.43
τ00 identifier 55.66
τ11 pID.logSurprisalText 2.97
ρ01 pID -0.85
ICC 0.03
N pID 91
N identifier 20
Observations 22322
Marginal R2 / Conditional R2 0.000 / 0.033
plot_model(m.mw,type = "emm", terms=c("logSurprisalText", "MW"))

plot_model(m.mw,type = "emm", terms=c("logFixDur", "MW"))
## `logFixDur` was not found in model terms. Maybe misspelled?
## Can't compute estimated marginal means, 'emmeans::emmeans()' returned an error.
## 
## Reason: No variable named logFixDur in the reference grid
## You may try 'ggpredict()' or 'ggeffect()'.
## NULL
emmeans(m.mw, pairwise ~ MW, p.adjust = "fdr")
## $emmeans
##  MW emmean   SE  df asymp.LCL asymp.UCL
##  0    3.77 2.68 Inf    -1.476      9.01
##  1    5.47 2.90 Inf    -0.223     11.16
## 
## Degrees-of-freedom method: asymptotic 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE  df z.ratio p.value
##  0 - 1        -1.7 2.08 Inf  -0.814  0.4155
## 
## Degrees-of-freedom method: asymptotic

bobby’s mode: MW label at page level, surprosal and n400 at fixation level

# mB <- allFit(lmer(n400_mean_CPz ~ 1 + MW*logSurprisalText + (MW*logSurprisalText|pID) + (logSurprisalText|pID:identifier), data=df_mw))
mB <- lmer(n400_mean_CPz ~ 1 + MW*logSurprisalText + (MW*logSurprisalText|pID) + (logSurprisalText|pID:identifier), data=df_mw, control=lmerControl(optimizer='Nelder_Mead'))
summary(mB)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: n400_mean_CPz ~ 1 + MW * logSurprisalText + (MW * logSurprisalText |  
##     pID) + (logSurprisalText | pID:identifier)
##    Data: df_mw
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
## REML criterion at convergence: 268460.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.8943 -0.5463  0.0153  0.5626  5.7741 
## 
## Random effects:
##  Groups         Name                 Variance Std.Dev. Corr             
##  pID:identifier (Intercept)           1654.44  40.675                   
##                 logSurprisalText        18.15   4.260  -1.00            
##  pID            (Intercept)            579.29  24.068                   
##                 MW1                  11698.49 108.160   0.02            
##                 logSurprisalText        20.62   4.541  -0.19 -0.86      
##                 MW1:logSurprisalText   389.91  19.746  -0.26 -0.95  0.76
##  Residual                             9416.73  97.040                   
## Number of obs: 22322, groups:  pID:identifier, 301; pID, 91
## 
## Fixed effects:
##                      Estimate Std. Error       df t value Pr(>|t|)
## (Intercept)            4.7955     4.3513  64.0041   1.102    0.275
## MW1                    4.6060    15.1917 461.7923   0.303    0.762
## logSurprisalText       0.2108     0.9029  44.2160   0.234    0.816
## MW1:logSurprisalText  -1.8703     3.0574  18.9739  -0.612    0.548
## 
## Correlation of Fixed Effects:
##             (Intr) MW1    lgSrpT
## MW1         -0.209              
## lgSrprslTxt -0.455 -0.144       
## MW1:lgSrprT  0.046 -0.909 -0.021
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues
tab_model(mB)
  n 400 mean C Pz
Predictors Estimates CI p
(Intercept) 4.80 -3.73 – 13.32 0.270
MW [1] 4.61 -25.17 – 34.38 0.762
logSurprisalText 0.21 -1.56 – 1.98 0.815
MW [1] * logSurprisalText -1.87 -7.86 – 4.12 0.541
Random Effects
σ2 9416.73
τ00 pID:identifier 1654.44
τ00 pID 579.29
τ11 pID:identifier.logSurprisalText 18.15
τ11 pID.MW1 11698.49
τ11 pID.logSurprisalText 20.62
τ11 pID.MW1:logSurprisalText 389.91
ρ01 pID:identifier -1.00
ρ01 pID.MW1 0.02
ρ01 pID.logSurprisalText -0.19
ρ01 pID.MW1:logSurprisalText -0.26
ICC 0.34
N pID 91
N identifier 20
Observations 22322
Marginal R2 / Conditional R2 0.000 / 0.338

min n400

# mB <- allFit(lmer(n400_mean_CPz ~ 1 + MW*logSurprisalText + (MW*logSurprisalText|pID) + (logSurprisalText|pID:identifier), data=df_mw))
mB.min <- lmer(n400_min_magnitude_CPz ~ 1 + MW*logSurprisalText + (MW*logSurprisalText|pID) + (logSurprisalText|pID:identifier), data=df_mw, control=lmerControl(optimizer='Nelder_Mead'))
summary(mB.min)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## n400_min_magnitude_CPz ~ 1 + MW * logSurprisalText + (MW * logSurprisalText |  
##     pID) + (logSurprisalText | pID:identifier)
##    Data: df_mw
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
## REML criterion at convergence: 272324.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8703 -0.5359  0.0344  0.5788  5.1761 
## 
## Random effects:
##  Groups         Name                 Variance   Std.Dev. Corr             
##  pID:identifier (Intercept)           1897.2779  43.558                   
##                 logSurprisalText        14.9662   3.869  -0.74            
##  pID            (Intercept)           1288.5762  35.897                   
##                 MW1                    104.7103  10.233  -0.55            
##                 logSurprisalText         0.5579   0.747  -0.64  0.84      
##                 MW1:logSurprisalText 35036.4903 187.180   0.38 -0.49 -0.44
##  Residual                            11069.8068 105.213                   
## Number of obs: 22322, groups:  pID:identifier, 301; pID, 91
## 
## Fixed effects:
##                      Estimate Std. Error       df t value            Pr(>|t|)
## (Intercept)           -75.105      5.380   98.438 -13.959 <0.0000000000000002
## MW1                     8.644      6.838  125.248   1.264               0.209
## logSurprisalText        0.130      0.782  138.419   0.166               0.868
## MW1:logSurprisalText    2.405     24.482 4951.097   0.098               0.922
##                         
## (Intercept)          ***
## MW1                     
## logSurprisalText        
## MW1:logSurprisalText    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) MW1    lgSrpT
## MW1         -0.513              
## lgSrprslTxt -0.354  0.253       
## MW1:lgSrprT  0.196 -0.050 -0.063
## optimizer (Nelder_Mead) convergence code: 4 (failure to converge in 10000 evaluations)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues
## failure to converge in 10000 evaluations
tab_model(mB.min)
  n 400 min magnitude C Pz
Predictors Estimates CI p
(Intercept) -75.10 -85.65 – -64.56 <0.001
MW [1] 8.64 -4.76 – 22.05 0.206
logSurprisalText 0.13 -1.40 – 1.66 0.868
MW [1] * logSurprisalText 2.41 -45.58 – 50.39 0.922
Random Effects
σ2 11069.81
τ00 pID:identifier 1897.28
τ00 pID 1288.58
τ11 pID:identifier.logSurprisalText 14.97
τ11 pID.MW1 104.71
τ11 pID.logSurprisalText 0.56
τ11 pID.MW1:logSurprisalText 35036.49
ρ01 pID:identifier -0.74
ρ01 pID.MW1 -0.55
ρ01 pID.logSurprisalText -0.64
ρ01 pID.MW1:logSurprisalText 0.38
ICC 0.79
N pID 91
N identifier 20
Observations 22322
Marginal R2 / Conditional R2 0.001 / 0.790

max n400

# mB <- allFit(lmer(n400_mean_CPz ~ 1 + MW*logSurprisalText + (MW*logSurprisalText|pID) + (logSurprisalText|pID:identifier), data=df_mw))
mB.max <- lmer(n400_max_magnitude_CPz ~ 1 + MW*logSurprisalText + (MW*logSurprisalText|pID) + (logSurprisalText|pID:identifier), data=df_mw, control=lmerControl(optimizer='Nelder_Mead'))
summary(mB.max)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## n400_max_magnitude_CPz ~ 1 + MW * logSurprisalText + (MW * logSurprisalText |  
##     pID) + (logSurprisalText | pID:identifier)
##    Data: df_mw
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
## REML criterion at convergence: 269299.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.6498 -0.5492  0.0005  0.5668  5.7653 
## 
## Random effects:
##  Groups         Name                 Variance Std.Dev. Corr             
##  pID:identifier (Intercept)          1720.701 41.481                    
##                 logSurprisalText        0.996  0.998   -1.00            
##  pID            (Intercept)          1540.422 39.248                    
##                 MW1                   325.368 18.038   -1.00            
##                 logSurprisalText       11.519  3.394    0.36 -0.38      
##                 MW1:logSurprisalText  694.098 26.346    0.40 -0.39 -0.21
##  Residual                            9745.094 98.717                    
## Number of obs: 22322, groups:  pID:identifier, 301; pID, 91
## 
## Fixed effects:
##                      Estimate Std. Error       df t value            Pr(>|t|)
## (Intercept)           79.8938     5.5434  49.4935  14.412 <0.0000000000000002
## MW1                    4.2239     6.4477 153.9680   0.655               0.513
## logSurprisalText       0.3479     0.7790  67.8454   0.447               0.657
## MW1:logSurprisalText  -0.5603     3.6415 365.6702  -0.154               0.878
##                         
## (Intercept)          ***
## MW1                     
## logSurprisalText        
## MW1:logSurprisalText    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) MW1    lgSrpT
## MW1         -0.641              
## lgSrprslTxt -0.031  0.069       
## MW1:lgSrprT  0.228 -0.119 -0.226
## optimizer (Nelder_Mead) convergence code: 4 (failure to converge in 10000 evaluations)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues
## failure to converge in 10000 evaluations
tab_model(mB.max)
  n 400 max magnitude C Pz
Predictors Estimates CI p
(Intercept) 79.89 69.03 – 90.76 <0.001
MW [1] 4.22 -8.41 – 16.86 0.512
logSurprisalText 0.35 -1.18 – 1.87 0.655
MW [1] * logSurprisalText -0.56 -7.70 – 6.58 0.878
Random Effects
σ2 9745.09
τ00 pID:identifier 1720.70
τ00 pID 1540.42
τ11 pID:identifier.logSurprisalText 1.00
τ11 pID.MW1 325.37
τ11 pID.logSurprisalText 11.52
τ11 pID.MW1:logSurprisalText 694.10
ρ01 pID:identifier -1.00
ρ01 pID.MW1 -1.00
ρ01 pID.logSurprisalText 0.36
ρ01 pID.MW1:logSurprisalText 0.40
ICC 0.28
N pID 91
N identifier 20
Observations 22322
Marginal R2 / Conditional R2 0.000 / 0.279

Eye-mind-linkage metric at page level

note atanh transform for correlation when it is to be used in model

eml_page <- df %>% group_by(pID, identifier) %>% summarise(
    count=n(),
    cor_n400meanCPz_logSurprisalText = atanh(cor(logSurprisalText, n400_mean_CPz)),
    cor_n400minCPz_logSurprisalText = atanh(cor(logSurprisalText, n400_min_magnitude_CPz)),
    cor_n400maxCPz_logSurprisalText = atanh(cor(logSurprisalText, n400_max_magnitude_CPz)),
        cor_n400meanCPz_logWordFreq = atanh(cor(logWordFreq, n400_mean_CPz)),
        cor_P1_logSurprisalText = atanh(cor(logSurprisalText, p1_mean)),
            cor_N1_logSurprisalText = atanh(cor(logSurprisalText, n1_mean)),
        cor_P1_logWordFreq = atanh(cor(logWordFreq, p1_mean)),
            cor_N1_logWordFreq = atanh(cor(logWordFreq, n1_mean)),
    cor_logFixDur_logSurprisalText = atanh(cor(logSurprisalText, logFixDur)),
    meanLogSurprisalText=mean(logSurprisalText),
    meanP1=mean(p1_mean),
    meanN1=mean(n1_mean),
    meanN400=mean(n400_mean_CPz)
) %>% drop_na() %>% 
  filter(count>2) # remove instances with single or two fixation on page as correlation coeff will be 1 or -1
eml_page[Reduce(`&`, lapply(eml_page, is.finite)),]
## # A tibble: 0 × 16
## # Groups:   pID [0]
## # … with 16 variables: pID <chr>, identifier <fct>, count <int>,
## #   cor_n400meanCPz_logSurprisalText <dbl>,
## #   cor_n400minCPz_logSurprisalText <dbl>,
## #   cor_n400maxCPz_logSurprisalText <dbl>, cor_n400meanCPz_logWordFreq <dbl>,
## #   cor_P1_logSurprisalText <dbl>, cor_N1_logSurprisalText <dbl>,
## #   cor_P1_logWordFreq <dbl>, cor_N1_logWordFreq <dbl>,
## #   cor_logFixDur_logSurprisalText <dbl>, meanLogSurprisalText <dbl>, …
df_page <- left_join(eml_page, df_comp, on=c('pID', 'identifer')) 
df_page$MW = as.factor(df_page$MW)


# plot the new metrics
p1<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=cor_n400meanCPz_logSurprisalText, color=MW))
p2<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=cor_logFixDur_logSurprisalText, color=MW))
p3<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=cor_P1_logSurprisalText , color=MW))
p4<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=cor_N1_logSurprisalText , color=MW))
p5<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=cor_P1_logWordFreq , color=MW))
p6<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=cor_N1_logWordFreq , color=MW))
grid.arrange(p1,p2,p3,p4, p5,p6)

p1<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=meanP1, color=MW))
p2<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=meanN1, color=MW))
p3<-ggplot(df_page %>% drop_na(MW))+
  geom_density( aes(x=meanN400 , color=MW))
grid.arrange(p1,p2,p3)

# plot by participantID
# ggplot(df_page, aes(x=cor_n400meanCPz_logSurprisalText, y=pID)) +
#   geom_boxplot(  horizontal = TRUE,        outlier.colour="red",
#         outlier.fill="red",
#         outlier.size=0.2)

binomial regression: predict page level MW from page level correlations with surprisal

# m.mw.mean <- glmer(MW ~ cor_n400meanCPz_logSurprisalText + (1|identifier) + (1+cor_n400meanCPz_logSurprisalText|pID), data=df_page %>% drop_na(MW), family = "binomial")
# plot_model(m.mw.mean,type = "emm", terms=c("cor_n400meanCPz_logSurprisalText"))
# 
# m.mw.min <- glmer(MW ~ cor_n400minCPz_logSurprisalText  + (1|identifier) + (1+cor_n400minCPz_logSurprisalText|pID), data=df_page %>% drop_na(MW), family = "binomial")
# plot_model(m.mw.min,type = "emm", terms=c("cor_n400minCPz_logSurprisalText"))
# 
# m.mw.max <- glmer(MW ~ cor_n400maxCPz_logSurprisalText  + (1|identifier) + (1+cor_n400maxCPz_logSurprisalText|pID), data=df_page %>% drop_na(MW), family = "binomial")
# plot_model(m.mw.max,type = "emm", terms=c("cor_n400maxCPz_logSurprisalText"))

m.mw.all <- glmer(MW ~ 
                    cor_P1_logWordFreq + cor_P1_logSurprisalText +  cor_N1_logWordFreq  + cor_N1_logSurprisalText +
                    cor_n400meanCPz_logWordFreq + cor_n400meanCPz_logSurprisalText   + 
                    (1|identifier) + (1|pID), data=df_page %>% drop_na(MW), family = "binomial")
m.mw.n400 <- glmer(MW ~ 
                    # cor_P1_logWordFreq + cor_P1_logSurprisalText +  cor_N1_logWordFreq  + cor_N1_logSurprisalText +
                    cor_n400meanCPz_logWordFreq + cor_n400meanCPz_logSurprisalText   + 
                    (1|identifier) + (1|pID), data=df_page %>% drop_na(MW), family = "binomial")
m.mw.main <- glmer(MW ~ 
                    # cor_P1_logWordFreq + cor_P1_logSurprisalText +  cor_N1_logWordFreq  + cor_N1_logSurprisalText +
                    meanP1 + meanN1   + meanN400+
                    (1|identifier) + (1|pID), data=df_page %>% drop_na(MW), family = "binomial")
# tab_model(m.mw.mean, m.mw.min, m.mw.max)
# tab_model(m.mw.mean)
tab_model(m.mw.all)
  MW
Predictors Odds Ratios CI p
(Intercept) 0.45 0.24 – 0.83 0.012
cor P1 logWordFreq 0.26 0.02 – 4.52 0.359
cor P1 logSurprisalText 0.12 0.01 – 1.70 0.117
cor N1 logWordFreq 1.00 0.09 – 11.56 0.997
cor N1 logSurprisalText 1.69 0.19 – 14.63 0.635
cor n400meanCPz
logWordFreq
1.13 0.22 – 5.86 0.888
cor n400meanCPz
logSurprisalText
0.24 0.05 – 1.15 0.075
Random Effects
σ2 3.29
τ00 pID 1.46
τ00 identifier 0.95
ICC 0.42
N identifier 20
N pID 91
Observations 287
Marginal R2 / Conditional R2 0.066 / 0.461
tab_model(m.mw.n400)
  MW
Predictors Odds Ratios CI p
(Intercept) 0.47 0.27 – 0.84 0.010
cor n400meanCPz
logWordFreq
1.51 0.43 – 5.24 0.520
cor n400meanCPz
logSurprisalText
0.35 0.09 – 1.29 0.114
Random Effects
σ2 3.29
τ00 pID 1.28
τ00 identifier 0.74
ICC 0.38
N identifier 20
N pID 91
Observations 287
Marginal R2 / Conditional R2 0.040 / 0.405
tab_model(m.mw.main)
  MW
Predictors Odds Ratios CI p
(Intercept) 0.64 0.64 – 0.64 <0.001
meanP1 0.99 0.99 – 0.99 <0.001
meanN1 1.01 1.00 – 1.01 <0.001
meanN400 1.00 1.00 – 1.01 0.001
Random Effects
σ2 3.29
τ00 pID 1.28
τ00 identifier 0.68
ICC 0.37
N identifier 20
N pID 91
Observations 287
Marginal R2 / Conditional R2 0.021 / 0.386
# emmeans(m.mw.mean, pairwise ~ cor_n400meanCPz_logSurprisalText, p.adjust = "fdr")

Does page-average surprisal predict MW

m.mw.sp <- glmer(MW ~ meanLogSurprisalText  +  (1+meanLogSurprisalText|pID), data=df_page %>% drop_na(MW), family = "binomial")
tab_model(m.mw.sp)
  MW
Predictors Odds Ratios CI p
(Intercept) 0.25 0.08 – 0.81 0.020
meanLogSurprisalText 1.96 0.61 – 6.24 0.256
Random Effects
σ2 3.29
τ00 pID 1.91
τ11 pID.meanLogSurprisalText 4.98
ρ01 pID -0.93
ICC 0.36
N pID 91
Observations 287
Marginal R2 / Conditional R2 0.010 / 0.367